{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 融合core与qmof\n",
    "本note旨在融合core与qmof数据集，进行数据的初步处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def dropun(X):\n",
    "    for x in X.columns:\n",
    "        if x[:7]=='Unnamed':\n",
    "              X=X.drop(columns=[x])\n",
    "    return X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读入core数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10140, 6)\n"
     ]
    }
   ],
   "source": [
    "core = dropun(pd.read_csv(\"CoRE_database/CoRE_MOFs.csv\"))\n",
    "print(core.shape)\n",
    "\n",
    "# core.columns=['name']\n",
    "core_name = list(core['name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "415 0.040927021696252466\n"
     ]
    }
   ],
   "source": [
    "cnt = 0\n",
    "for mof in core_name:\n",
    "    if mof[0]<'A' or mof[0]>'Z':\n",
    "        cnt +=1\n",
    "print(cnt, cnt/len(core_name))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读入qmof数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(18321, 21)\n"
     ]
    }
   ],
   "source": [
    "qmof = dropun(pd.read_csv(\"qmof_database/qmof.csv\"))\n",
    "\n",
    "print(qmof.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>info.source</th>\n",
       "      <th>info.formula</th>\n",
       "      <th>info.natoms</th>\n",
       "      <th>info.pld</th>\n",
       "      <th>info.lcd</th>\n",
       "      <th>info.density</th>\n",
       "      <th>inputs.pbe.theory</th>\n",
       "      <th>inputs.pbe.pseudopotentials</th>\n",
       "      <th>inputs.pbe.encut</th>\n",
       "      <th>...</th>\n",
       "      <th>inputs.pbe.gamma</th>\n",
       "      <th>inputs.pbe.spin</th>\n",
       "      <th>outputs.pbe.energy_total</th>\n",
       "      <th>outputs.pbe.energy_vdw</th>\n",
       "      <th>outputs.pbe.energy_elec</th>\n",
       "      <th>outputs.pbe.bandgap</th>\n",
       "      <th>outputs.pbe.directgap</th>\n",
       "      <th>outputs.pbe.cbm</th>\n",
       "      <th>outputs.pbe.vbm</th>\n",
       "      <th>outputs.pbe.net_magmom</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>...</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18319</td>\n",
       "      <td>18319</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>...</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "      <td>18321</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         name info.source info.formula info.natoms info.pld info.lcd  \\\n",
       "count   18321       18321        18321       18321    18321    18321   \n",
       "unique      1           1            1           1        2        2   \n",
       "top     False       False        False       False    False    False   \n",
       "freq    18321       18321        18321       18321    18319    18319   \n",
       "\n",
       "       info.density inputs.pbe.theory inputs.pbe.pseudopotentials  \\\n",
       "count         18321             18321                       18321   \n",
       "unique            1                 1                           1   \n",
       "top           False             False                       False   \n",
       "freq          18321             18321                       18321   \n",
       "\n",
       "       inputs.pbe.encut  ... inputs.pbe.gamma inputs.pbe.spin  \\\n",
       "count             18321  ...            18321           18321   \n",
       "unique                1  ...                1               1   \n",
       "top               False  ...            False           False   \n",
       "freq              18321  ...            18321           18321   \n",
       "\n",
       "       outputs.pbe.energy_total outputs.pbe.energy_vdw  \\\n",
       "count                     18321                  18321   \n",
       "unique                        1                      1   \n",
       "top                       False                  False   \n",
       "freq                      18321                  18321   \n",
       "\n",
       "       outputs.pbe.energy_elec outputs.pbe.bandgap outputs.pbe.directgap  \\\n",
       "count                    18321               18321                 18321   \n",
       "unique                       1                   1                     1   \n",
       "top                      False               False                 False   \n",
       "freq                     18321               18321                 18321   \n",
       "\n",
       "       outputs.pbe.cbm outputs.pbe.vbm outputs.pbe.net_magmom  \n",
       "count            18321           18321                  18321  \n",
       "unique               1               1                      1  \n",
       "top              False           False                  False  \n",
       "freq             18321           18321                  18321  \n",
       "\n",
       "[4 rows x 21 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qmof.isnull().describe()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(qmof.shape[0]):\n",
    "    if qmof.loc[i, 'name'].endswith('_FSR'):\n",
    "        qmof.loc[i, 'name'] = qmof.loc[i, 'name'][:-4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'ABAVIJ' in qmof['name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof = pd.merge(core, qmof, how='inner', left_on='name', right_on='name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Deposition Number(s)</th>\n",
       "      <th>info.natoms</th>\n",
       "      <th>info.pld</th>\n",
       "      <th>info.lcd</th>\n",
       "      <th>info.density</th>\n",
       "      <th>inputs.pbe.encut</th>\n",
       "      <th>outputs.pbe.energy_total</th>\n",
       "      <th>outputs.pbe.energy_vdw</th>\n",
       "      <th>outputs.pbe.energy_elec</th>\n",
       "      <th>outputs.pbe.bandgap</th>\n",
       "      <th>outputs.pbe.cbm</th>\n",
       "      <th>outputs.pbe.vbm</th>\n",
       "      <th>outputs.pbe.net_magmom</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1.231000e+03</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.0</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "      <td>1294.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>8.695718e+05</td>\n",
       "      <td>160.122102</td>\n",
       "      <td>3.008050</td>\n",
       "      <td>4.371541</td>\n",
       "      <td>1.662960</td>\n",
       "      <td>520.0</td>\n",
       "      <td>-1076.720700</td>\n",
       "      <td>-11.807468</td>\n",
       "      <td>-1064.913231</td>\n",
       "      <td>2.107698</td>\n",
       "      <td>1.382659</td>\n",
       "      <td>-0.725039</td>\n",
       "      <td>3.042087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.027292e+05</td>\n",
       "      <td>92.481359</td>\n",
       "      <td>1.833792</td>\n",
       "      <td>1.852618</td>\n",
       "      <td>0.449311</td>\n",
       "      <td>0.0</td>\n",
       "      <td>617.950760</td>\n",
       "      <td>6.705643</td>\n",
       "      <td>611.423905</td>\n",
       "      <td>1.225390</td>\n",
       "      <td>1.135637</td>\n",
       "      <td>1.008547</td>\n",
       "      <td>8.542077</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.013380e+05</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.583490</td>\n",
       "      <td>1.231010</td>\n",
       "      <td>0.475276</td>\n",
       "      <td>520.0</td>\n",
       "      <td>-3449.382160</td>\n",
       "      <td>-37.828250</td>\n",
       "      <td>-3413.768901</td>\n",
       "      <td>0.003537</td>\n",
       "      <td>-4.315972</td>\n",
       "      <td>-4.722541</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>7.333245e+05</td>\n",
       "      <td>94.000000</td>\n",
       "      <td>1.672273</td>\n",
       "      <td>3.171125</td>\n",
       "      <td>1.339884</td>\n",
       "      <td>520.0</td>\n",
       "      <td>-1390.554230</td>\n",
       "      <td>-14.796610</td>\n",
       "      <td>-1374.470651</td>\n",
       "      <td>0.931663</td>\n",
       "      <td>0.621968</td>\n",
       "      <td>-1.381533</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>8.751770e+05</td>\n",
       "      <td>136.000000</td>\n",
       "      <td>2.698560</td>\n",
       "      <td>4.129870</td>\n",
       "      <td>1.581588</td>\n",
       "      <td>520.0</td>\n",
       "      <td>-932.175776</td>\n",
       "      <td>-10.318445</td>\n",
       "      <td>-921.094991</td>\n",
       "      <td>2.275854</td>\n",
       "      <td>1.420754</td>\n",
       "      <td>-0.741766</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>9.980655e+05</td>\n",
       "      <td>208.000000</td>\n",
       "      <td>3.743205</td>\n",
       "      <td>5.039013</td>\n",
       "      <td>1.927021</td>\n",
       "      <td>520.0</td>\n",
       "      <td>-629.457388</td>\n",
       "      <td>-7.112253</td>\n",
       "      <td>-621.970803</td>\n",
       "      <td>3.037810</td>\n",
       "      <td>2.152122</td>\n",
       "      <td>-0.098097</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.542564e+06</td>\n",
       "      <td>500.000000</td>\n",
       "      <td>16.924860</td>\n",
       "      <td>16.989590</td>\n",
       "      <td>4.051389</td>\n",
       "      <td>520.0</td>\n",
       "      <td>-140.843654</td>\n",
       "      <td>-1.669500</td>\n",
       "      <td>-138.669684</td>\n",
       "      <td>5.306719</td>\n",
       "      <td>5.139415</td>\n",
       "      <td>2.687465</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Deposition Number(s)  info.natoms     info.pld     info.lcd  \\\n",
       "count          1.231000e+03  1294.000000  1294.000000  1294.000000   \n",
       "mean           8.695718e+05   160.122102     3.008050     4.371541   \n",
       "std            3.027292e+05    92.481359     1.833792     1.852618   \n",
       "min            1.013380e+05    24.000000     0.583490     1.231010   \n",
       "25%            7.333245e+05    94.000000     1.672273     3.171125   \n",
       "50%            8.751770e+05   136.000000     2.698560     4.129870   \n",
       "75%            9.980655e+05   208.000000     3.743205     5.039013   \n",
       "max            1.542564e+06   500.000000    16.924860    16.989590   \n",
       "\n",
       "       info.density  inputs.pbe.encut  outputs.pbe.energy_total  \\\n",
       "count   1294.000000            1294.0               1294.000000   \n",
       "mean       1.662960             520.0              -1076.720700   \n",
       "std        0.449311               0.0                617.950760   \n",
       "min        0.475276             520.0              -3449.382160   \n",
       "25%        1.339884             520.0              -1390.554230   \n",
       "50%        1.581588             520.0               -932.175776   \n",
       "75%        1.927021             520.0               -629.457388   \n",
       "max        4.051389             520.0               -140.843654   \n",
       "\n",
       "       outputs.pbe.energy_vdw  outputs.pbe.energy_elec  outputs.pbe.bandgap  \\\n",
       "count             1294.000000              1294.000000          1294.000000   \n",
       "mean               -11.807468             -1064.913231             2.107698   \n",
       "std                  6.705643               611.423905             1.225390   \n",
       "min                -37.828250             -3413.768901             0.003537   \n",
       "25%                -14.796610             -1374.470651             0.931663   \n",
       "50%                -10.318445              -921.094991             2.275854   \n",
       "75%                 -7.112253              -621.970803             3.037810   \n",
       "max                 -1.669500              -138.669684             5.306719   \n",
       "\n",
       "       outputs.pbe.cbm  outputs.pbe.vbm  outputs.pbe.net_magmom  \n",
       "count      1294.000000      1294.000000             1294.000000  \n",
       "mean          1.382659        -0.725039                3.042087  \n",
       "std           1.135637         1.008547                8.542077  \n",
       "min          -4.315972        -4.722541                0.000000  \n",
       "25%           0.621968        -1.381533                0.000000  \n",
       "50%           1.420754        -0.741766                0.000000  \n",
       "75%           2.152122        -0.098097                0.000000  \n",
       "max           5.139415         2.687465              100.000000  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "core_qmof.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof.to_csv('core_qmof.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上融合的结果为1294个，保存在core_qmof中"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导出core_qmof链接\n",
    "\n",
    "下载后数据集pdf名称对照表被存在core_qmof_pdf中，doi的最后一栏就是split（/）之后取str[4]\n",
    "\n",
    "由于core_qmof_pdf是core_qmof的简化版，导出core_qmof_link之后需要在此与core_qmof_pdf联合\n",
    "\n",
    "注意：\n",
    "1. **doi及文件名要统一大写转小写**\n",
    "2. **doi及文件名要统一将%转成正常字符！**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_filename(name):\n",
    "    return name.lower().replace('%28', '(').replace('%29', ')').replace('%3b', ';').replace('%3a', ':')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "core_qmof_word\n",
      "core_qmof_html_dest\n",
      "core_qmof_pdf\n",
      "core_qmof_html\n",
      "core_qmof_txt_word\n",
      "core_qmof_txt_original\n",
      "core_qmof_txt_pdfminer\n",
      "core_qmof_txt\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "def rename (path):\n",
    "    filenames = os.listdir(path)\n",
    "    for filename in filenames:\n",
    "        lowerfilename = get_filename(filename)\n",
    "        file = os.path.join(path, filename)\n",
    "        new_file = os.path.join(path, lowerfilename)\n",
    "        if os.path.isdir(file):\n",
    "            rename(file)\n",
    "        else:\n",
    "            os.rename(file, new_file)\n",
    "    \n",
    "for directory in os.listdir('../convert/'):\n",
    "    if os.path.isdir('../convert/' + directory) and directory.startswith('core_qmof'):\n",
    "        print(directory)\n",
    "        rename('../convert/' + directory)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof_pdf = dropun(pd.read_csv('core_qmof_pdf.csv'))  # 丢失了数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![image-20210802190517360](https://gitee.com/sun__ye/gallery/raw/master/img2/20210802190542.png)\n",
    "\n",
    "![image-20210802190353879](https://gitee.com/sun__ye/gallery/raw/master/img2/20210802190550.png)\n",
    "\n",
    "![image-20210802190409676](https://gitee.com/sun__ye/gallery/raw/master/img2/20210802190552.png)\n",
    "\n",
    "df自动将 `%28`转为`(`，自动将`%29`转为`)`，而文件名却不会自动转换。因此再通过dataframe寻找对应的文件时找不到。然而后续文件名中只要带有%的文件，zdir都无法访问，这里必须还是按照dataframe的方式。将doi和另存为的文件名都转一下"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(sici)1521-3773(19990917)38:18%3c2730::aid-anie2730%3e3.3.co;2-0\n",
      "1521-3749(200102)627:2%3c131::aid-zaac131%3e3.3.co;2-r\n",
      "(sici)1521-3773(19990115)38:1\n"
     ]
    }
   ],
   "source": [
    "untrace = pd.DataFrame(columns=['name', 'doi_link'])\n",
    "os.system('rm -rf ../convert/untraced_pdf')\n",
    "\n",
    "for i in range(core_qmof_pdf.shape[0]):\n",
    "#     print(core_qmof_pdf.loc[i,'doi_link'])\n",
    "    if core_qmof_pdf.loc[i,'doi_link'] == 'not_exist':\n",
    "        core_qmof_pdf.loc[i,'doi'] = 'not_exist'\n",
    "        continue\n",
    "        \n",
    "    doi = core_qmof_pdf.loc[i,'doi_link'].split('/')[4]\n",
    "\n",
    "    doi = get_filename(doi)\n",
    "    core_qmof_pdf.loc[i,'doi'] = doi\n",
    "    if os.path.exists('../convert/core_qmof_pdf/' + doi + '.pdf'):\n",
    "        core_qmof_pdf.loc[i,'pdf_link'] = 'http://d.sunie.top:9009/material/convert/core_qmof_pdf/' + doi + '.pdf'\n",
    "    else:\n",
    "        core_qmof_pdf.loc[i,'pdf_link'] = 'not_exist'\n",
    "        untrace.loc[untrace.shape[0]]={'name': core_qmof_pdf.loc[i, 'name'], 'doi_link': core_qmof_pdf.loc[i,'doi_link']}\n",
    "        print(doi)\n",
    "\n",
    "    if os.path.exists('../convert/core_qmof_txt/' + doi + '.txt'):\n",
    "        # 这里不需要后缀名！！\n",
    "        core_qmof_pdf.loc[i, 'brat_link'] = 'http://d.sunie.top:9020/index.xhtml#/material/convert/core_qmof_txt/' + doi\n",
    "    else:\n",
    "        core_qmof_pdf.loc[i, 'brat_link'] = 'not_exist'\n",
    "        \n",
    "    if os.path.exists('../convert/core_qmof_txt_pdfminer/' + doi + '.txt'):\n",
    "        core_qmof_pdf.loc[i, 'pdfminer_link'] = 'http://d.sunie.top:9009/material/convert/core_qmof_txt_pdfminer/' + doi + '.txt'\n",
    "    else:\n",
    "        core_qmof_pdf.loc[i, 'pdfminer_link'] = 'not_exist'\n",
    "    \n",
    "    if os.path.exists('../convert/core_qmof_html/' + doi + '.html'):\n",
    "        core_qmof_pdf.loc[i,'html_link'] = 'http://d.sunie.top:9009/material/convert/core_qmof_html/' + doi + '.html'\n",
    "    else:\n",
    "        core_qmof_pdf.loc[i,'html_link'] = 'not_exist'\n",
    "        \n",
    "        # 记录未转换的文件\n",
    "        os.makedirs('../convert/untraced_pdf', exist_ok=True)\n",
    "        os.system(f'cp ../convert/core_qmof_pdf/{doi}.pdf ../convert/untraced_pdf/{doi}.pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "untrace.to_csv('untrace.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof_pdf = core_qmof_pdf.drop(columns=['doi_link'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof_link = pd.merge(core_qmof_pdf, core_qmof, how='inner', left_on='name', right_on='name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof_link.to_csv('core_qmof_link.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "关于core_qmof_pdf有效数据的长度，必须排除`nan`在外！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "891"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(core_qmof_pdf['doi']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 筛选出论文材料1:1的数据\n",
    "\n",
    "1. 寻找只被指向过一次的论文\n",
    "2. 根据这些论文寻找对应的材料，并添加"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "paper = {}\n",
    "\n",
    "for i in range(len(core_qmof_link)):\n",
    "    name = core_qmof_link.loc[i, 'name']\n",
    "    doi_link = core_qmof_link.loc[i, 'doi_link']\n",
    "    if doi_link!=\"not_exist\":\n",
    "        if doi_link not in paper.keys():\n",
    "            paper[doi_link] = 0\n",
    "        paper[doi_link] += 1\n",
    "\n",
    "paper = [k for k,v in paper.items() if v==1 ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "drop_row = []\n",
    "for i in range(len(core_qmof_link)):\n",
    "    if core_qmof_link.loc[i, 'doi_link'] not in paper:\n",
    "        drop_row.append(i)\n",
    "        \n",
    "core_qmof704 = core_qmof_link.drop(drop_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof704 = core_qmof704.reset_index().drop(columns=['index'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "core_qmof704.to_csv('core_qmof704.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "702"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(core_qmof704['pdf_link']))   # 实际pdf数量"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 删除转换较差的html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-14T09:24:43.483392Z",
     "start_time": "2021-10-14T09:24:43.478237Z"
    }
   },
   "outputs": [],
   "source": [
    "bad_doi = \"\"\"ic802445q\n",
    "chem.201002088\n",
    "acs.cgd.5b00681\n",
    "c0dt00931h\n",
    "c0dt00999g\n",
    "acs.inorgchem.5b02340\n",
    "c6cc03285k\n",
    "chem.200305098\n",
    "acs.cgd.6b00890\n",
    "adfm.200800058\n",
    "ic00136a027\n",
    "chem.201502033\n",
    "ajchem.2014.16986\n",
    "jssc.2001.9118\n",
    "b200213b\n",
    "c1dt10901d\n",
    "s0108768101020717\n",
    "ejic.201200686\n",
    "j.solidstatesciences.2004.03.007\n",
    "b809557d\n",
    "chem.200801948\n",
    "\"\"\"\n",
    "\n",
    "bad_doi = bad_doi.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-10-14T09:31:43.480069Z",
     "start_time": "2021-10-14T09:31:43.474320Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "dirname = '../convert/core_qmof_html'\n",
    "\n",
    "for f in os.listdir(dirname):\n",
    "    if f.endswith('.html'):\n",
    "        if f[:-5].endswith('(2)'):\n",
    "            print(f)\n",
    "            os.remove(f'{dirname}/{f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "for doi in bad_doi.split():\n",
    "    path = f'{dirname}/{doi}.html'\n",
    "    os.remove(path)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "7383003b210fdacca9bf7683d9d1d561f4a72c77adad40daede406a89507eb7d"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
